Datasets

Raw datasets:

1976-2020-house.csv, StatesAndCyclesData.csv, VoteAndRegByState2020.xlsx, VoteAndRegByStateHistory_prez_reg.xlsx, VoteAndRegByStateHistory_prez_vot.xlsx, VoteAndRegByStateHistory_congress.xlsx. ## Final datasets: analysis_data.csv

Install/load libraries

#install.packages('tidycensus')
#install.packages('position_dodge')
#library(position_dodge)
library(scales)
library(tidycensus)
library(tidyverse)
library(data.table)
library(readxl)
#install.packages('maps')
library(maps)
library(stargazer)
#install.packages('R.utils')
library(ggrepel)
library(R.utils)
library(ggpubr)
#library(arm)
#install.packages("coefplot")
library(coefplot)
#library(dplyr)
library(plotly)

#library(ggcoefstats)


options(tigris_use_cache = TRUE)

Load cleaned redistricting data

This is the dataset which is the output of the data cleaning script.

redist <- read.csv("/Users/Chuyuan/Downloads/analysis_data.csv", stringsAsFactors = FALSE)

Data Cleaning and Merging

We worked with three separate data sets for our research project, which needed to be organized and merged to state-election year pairings for our analysis. Single district states were left in the data to be included for certain visualizations, but they would ultimately be dropped for most of the substantive analysis.

Redistricting Data

This data set began as state-redistricting map pairings, tied to the original census year that maps were based on. A significant complicating factor, however, is that several states have map revisions as a result of court challenges, creating inconsistent numbers of pairings. Fortunately, the data also has columns denoting the years a given map was active, which could be used to associate an individual map with the elections where it was active. Since this data frame would be merged with both of our other data sets, this is also also the step where we added dichotomous dummy variables for use in later analysis.

This was merged in several steps: 1. Filtering out state house and state senate maps, as well as the 2020 congressional maps 2. Filling in blank entries in the “Drawn.by” column with “Institution” 3. Identifying states that have revised maps (specifically FL, NC, OH, PA, RI, TX, VA) 4. Creating an additional data frame for pairing census cycles to election years 5. Doing a full_join on the data frames to create all possible pairings 6. Adding a dummy variable for November 1st of each year as a cutoff point to mark election dates 7. Using the above to create another dummy variable to mark election year-map pairings where a map was inactive and filter them out 8. Adding a dummy variable for which states used more than one redistricting method in the sample window 9. Adding a variety of dichotomous dummy variables to identify different redistricting methods

redistricting_raw <- read.csv("./source_data/StatesAndCyclesData.csv", stringsAsFactors = FALSE)

# 1. Filtering out state house and state senate maps, as well as the 2020 congressional maps
# 2. Filling in blank entries in the "Drawn.by" column with "Institution"
redistricting_raw <- redistricting_raw %>%
  filter(Level == "Congress", Cycle.Year < 2020) %>%
  mutate(Drawn.by = ifelse(Drawn.by == "", Institution, Drawn.by))

# 3. Identifying states that have revised maps (FL, NC, OH, PA, RI, TX, VA)
problem_states <- redistricting_raw %>%
  count(State) %>%
  filter(n > 2) %>%
  select(State) %>%
  pull(State)

# 4. Creating an additional data frame for pairing census cycles to election years
cycle_pairs <- rbind(
  data.frame(Cycle.Year = 2000, Year = c(2002, 2004, 2006, 2008, 2010)),
  data.frame(Cycle.Year = 2010, Year = c(2012, 2014, 2016, 2018, 2020)))

# 5. Doing a full_join on the data frames to create all possible pairings
# 6. Adding a dummy variable for November 1st of each year as a cutoff point to mark election dates
redistricting_joined <- full_join(redistricting_raw, cycle_pairs, by = "Cycle.Year") %>%
  mutate(Election.Cutoff = paste(Year, "-11-01", sep = ""),
         Start.Date = ifelse((Start.Date == "") & (Seats == 1), "2001-12-31", Start.Date),
         End.Date = ifelse((End.Date == "") & (Seats == 1), "2001-12-31", End.Date))

# 7. Using the above to create another dummy variable to mark election year-map pairings where a map was inactive and filter them out
redistricting <- redistricting_joined %>%
  mutate(Inactive = ifelse(!(State %in% problem_states), F,
                    ifelse(difftime(redistricting_joined$Start.Date, Election.Cutoff) > 0 |
                           difftime(redistricting_joined$End.Date, Election.Cutoff) < 0, T, F))) %>%
  filter(Inactive == FALSE)

# 8. Adding a dummy variable for which states used more than one redistricting method in the sample window
redistricting_changes = redistricting %>%
  group_by(State) %>%
  summarize(redistricting_methods = n_distinct(Drawn.by)) %>%
  mutate(changed = ifelse(redistricting_methods > 1, 1, 0))

redistricting <- left_join(redistricting, redistricting_changes)

# Example showing the successful isolation of map revisions and accurate pairings
redistricting %>%
  filter(State %in% problem_states) %>%
  select(State, Drawn.by, Year, Start.Date, End.Date, changed)
# 9. Adding a variety of dichotomous dummy variables to identify different redistricting methods for later analysis
redistricting <- redistricting %>%
  mutate(legislature = ifelse(Drawn.by == "Legislature", 1, 0),
         partisan = ifelse((Drawn.by == "Legislature") |
                           (Drawn.by == "Politician commission") , 1, 0),
         independent_commission = ifelse(Drawn.by == "Independent commission", 1, 0),
         court = ifelse(str_detect(tolower(Drawn.by), "court"), 1, 0),
         mid_term = ifelse(Year %% 4 == 2, 1, 0))

redistricting %>%
  select(Year, State, Drawn.by, changed, legislature, partisan,
         independent_commission, court, mid_term)

Election results

The MIT election lab data was more granular than necessary for our analysis, containing state-election year-district-candidate observations with vote totals for each candidate. It was aggregated to the district level, pivoted wider to create state-election year-district observations, and then merged with the redistricting data. Third party candidates were grouped as “OTHER” for the aggregation, and Minnesota’s Democratic party was renamed to match the national party.

elec <- read.csv("./source_data/1976-2020-house.csv")

votes <- elec %>%
  filter(year >= 2002) %>%
  mutate(party = ifelse(party == "REPUBLICAN", "REPUBLICAN",
                 ifelse(party == "DEMOCRAT", "DEMOCRAT",
                 ifelse(str_detect(party, "DEMOCRATIC-FARM"), "DEMOCRAT",
                        "OTHER")))) %>%
  group_by(year, state_po, district, party) %>%
  summarize(votes = sum(candidatevotes), totalvotes = mean(totalvotes)) %>%
  pivot_wider(names_from = party, values_from = votes, values_fill = 0) %>%
  left_join(redistricting, by = c("year"="Year", "state_po"="State")) %>% 
  mutate(dem_margin = 100*(DEMOCRAT-REPUBLICAN)/totalvotes)

write.csv(votes, "votes_redistricting.csv", row.names=FALSE)
votes


Census turnout Data

This data set contained State-election year-voter Keep only for required span - 2000 to 2020 Drop states with only one Congressional district. Then, convert the party variable to have only three categories - Republican, Democrat and Other. Pivot to wide, and get vote totals and shares.

Census data was available for the current census and historical data (separated). Current data is combined, historical data is by presidential/midterms. We import the relevant cells from the Excel sheets and merge them as needed.

## New names:
## New names:
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`

Merge Census and redistricting data

redistricting <- left_join(redistricting, states,
                           by = c("State" = "state_po")) %>%
                 mutate(Cycle.Year = as.numeric(Cycle.Year),
                        state_full = tolower(state_full))

redist_turn <- left_join(redistricting, all_years_long,
                         by = c("state_full" = "turnout_state", "Year" = "turnout_year"))

Download census data using API - state boundaries

https://rconsortium.github.io/censusguide/r-packages-all.html

https://walker-data.com/tidycensus/articles/spatial-data.html

The following code uses the tidycensus package and the Census API to download data + shapefiles from the US Census.

#Declare parameters - API key, variables, time

census_api_key("2f1473c692f61175605ea04cbe2a9a1b41d5bf7c")

yr_select <- c(2010)

#call DF

stpop <- get_decennial(geography = "state", 
                        variables = "P003001", 
                         geometry = TRUE,
                        year = yr_select) %>% select(-variable) %>%
  filter(NAME!="Puerto Rico", NAME!="Alaska", NAME!="Hawaii" )
  
stpop$NAME <- tolower(stpop$NAME) 

Merge both dataframes

Create a spatial dataframe by merging both datasets.

election_map<-merge(stpop, redist_turn, by.x = "NAME", by.y = "state_full" )

Descriptive/graphical analysis

Graph 1: State redistricting policies

We filter the dataframe to a single year, say, 2012. Then we use ‘ggplot’ with the ‘geom_sf’ function to make a choropleth map, by unordered categorical variable ‘Drawn.by’. This variable includes a string indicating which type of redistricting policy was in effect in a state.

#idvars = c("NAME", "GEOID", "geometry")

# Visualization

##Maps

map2012 <- election_map %>% filter(Year==2012)

## democratic votes

redist_map <- map2012 %>% 
  ggplot() + 
  geom_sf(aes(fill=Drawn.by), color="black", size=0.1) +
    geom_sf_text(aes(label = State), family = "Times", size = 2.6,
               color = "black", alpha = 0.9, hjust = 0.5, vjust = 0.5,
               label.padding = unit(0, "pt")) +
  labs(title = "Redistricting policies in 2012")+
  theme(plot.title =element_text(hjust=0.5, face="bold"), #moves title to the middle of graph, bolds it
          text= element_text(family= "Times", size = )) +
   scale_fill_manual(values = c("#80b1d3", "#ffffb3", "#98FB98", "#fb8072", "#8dd3c7" , "#436EEE"))
redist_map

#ggsave("./images/redist_map_2012.png", redist_map)

Graph 2: Boxplot of average turnout

The following makes a boxplot depicting level of turnout for all elections, categorised by the state’s redistricting policy. It shows that politican commision has the lowest mean. Next, legislature-drawn states saw the lowest turnout values. Thus, the lowest values of turnout are seen by partisan mechanisms.

#Filter out single district states, and only till 2021.
redist2 <- redist_turn %>% filter(Drawn.by!="n/a") %>%
  filter(Seats != 1,
         Year < 2022)


# Boxplot

##Make a smaller DF of average turnout by redistricting policy
turnout_by_drawnby <- redist2 %>%
  group_by(Drawn.by) %>%
  summarise(turnout_perc = mean(turnout_perc))

#now make boxplot
#TODO: add coloring same as US map coloring
boxplt<- redist2 %>%
  ggplot(aes(x = Drawn.by, y = turnout_perc)) +
  geom_boxplot() +
    geom_text(data = turnout_by_drawnby,
            aes(x = Drawn.by, y = turnout_perc, label = round(turnout_perc, 0)),
            color = "red",
            size = 3,
            hjust = 0.5,
            vjust = 0.5
          #  position = position_dodge(width=1)
            ) +
    theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip() +
  labs(title = "Avg. Turnout from 2000-2020, by Category",
       x = "",
       y = "Percent of eligible voters who voted") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.title = element_text(face =))

#ggsave("./images/boxplot.png", boxplt)
boxplt

Graph 3: Lineplot of average turnout

By redistricting policy

Following lineplot shows the trends of average voter turnout percentages under each drawing institiution through 2000-2020.

# Aggregate to institution
inst_lineplt <- election_map %>% group_by(Drawn.by, Year) %>% summarise(avg_turnout = mean(turnout_perc)) %>% filter(Drawn.by!="n/a")

lesscom <- inst_lineplt %>% filter(Drawn.by %in% c("State court", "Federal court", "Legislature", "Independent commission"))

lineplt <- lesscom  %>%
   ggplot(aes(x=Year, y = avg_turnout, colour = Drawn.by)) + 
  geom_line(size = .7) + 
  labs(title = "Avg. Turnout percentage 2000-2020, by redistricting ", x = "year", y = "Turnout percentage") +
  theme(plot.title = element_text(hjust = 0.5, size = 11))

lineplt

#ggsave("./images/lineplt_trn.html", lineplt)

Regression model: Ordinary Least Squares

We run three regression models at the midterm, presidential and overall levels using the same basic model.

Model 1: Full-sample, Midterm dummy

Our main independent variable derives from the ‘Institution’ or ‘Drawn.By’ columns. For the first set of OLS results, we run three models.

\(y_{i} = \beta_0 + \beta_1 Policy_{i} + \beta_2 Midterm_{i} + \epsilon_{i}\)

turn_reg_all <- lm(turnout_perc ~ Drawn.by + mid_term, data = redist2)

Model 2: Midterms vs Presidential sample, no midterm dummy

Run the above model without the midterm dummy, separately for midterm elections and presidential elections. We use the ‘mid_term’ indicator variable generated.

\(y_{i} = \beta_0 + \beta_1 Policy_{i} + \epsilon_{i}\)

#Subset to midterms, and estimate the model
redist_mid <- redist2 %>% filter(mid_term==1)
Midterm <- lm(turnout_perc ~ Drawn.by+ mid_term, data = redist_mid)

#Subset to Presidential elections, and estimate the model

redist_pres <- redist2 %>% filter(mid_term==0)
Presidential <- lm(turnout_perc ~ Drawn.by + mid_term, data = redist_pres)

OLS Regression: Results

Estimates table

We use stargazer to generate a formatted table of the regression coefficients for the three estimates models. The result shows that midterm has lower turnouts. Politican commission has the most negative significant effects, while state court has the most positive significant effects.

stargazer(turn_reg_all, Midterm, Presidential, column.labels = c("Overall", "Midterm", "Presidential"), type ="text")

==================================================================================================== Dependent variable:
——————————————————————— turnout_perc
Overall Midterm Presidential
(1) (2) (3)
—————————————————————————————————- Drawn.byFederal court 2.594 2.178 3.096
(1.729) (2.358) (2.601)

Drawn.byIndependent commission -0.099 -0.429 0.321
(1.740) (2.375) (2.613)

Drawn.byLegislature 2.019 1.883 2.253
(1.563) (2.087) (2.399)

Drawn.byPolitician commission -4.606** -6.170** -2.945
(1.879) (2.583) (2.797)

Drawn.byState court 3.147* 3.313 3.078
(1.657) (2.237) (2.516)

mid_term -5.138***
(0.468)

Constant 70.343*** 65.400*** 70.050***
(1.559) (2.042) (2.364)

Observations 430 215 215 R2 0.285 0.132 0.075 Adjusted R2 0.275 0.111 0.053 Residual Std. Error 4.850 (df = 423) 5.002 (df = 209) 4.727 (df = 209) F Statistic 28.142*** (df = 6; 423) 6.329*** (df = 5; 209) 3.379*** (df = 5; 209) ==================================================================================================== Note: p<0.1; p<0.05; p<0.01
partisan -0.969** (0.491)
court 1.260** (0.514)

State fixed effects Yes Yes
Year fixed effects Yes Yes
Observations 430 430
R2 0.010 0.016
Adjusted R2 -0.126 -0.120
F Statistic (df = 1; 377) 3.893** 6.010**
====================================================== Note: p<0.1; p<0.05; p<0.01

plm1

Model Formula: turnout_perc ~ partisan + mid_term

Coefficients: partisan -0.96919

Part - 2: Analysis of vote shares

Merge Vote share data

This section adds the elections data, and performs basic cleaning/transformation to merge it with the previously made redistricting data frame. This produces a new data frame with district-level observations, and each row containing the number of votes for each candidate (Republican/Democrat/Other), as well as the redistricting method for that election.

Note: Minnesota’s Democratic party is known as the “Democratic-Farmer-Labor” party and required special handling.

elec <- read.csv("./source_data/1976-2020-house.csv")

votes <- elec %>%
  filter(year >= 2002) %>%
  mutate(party = ifelse(party == "REPUBLICAN", "REPUBLICAN",
                 ifelse(party == "DEMOCRAT", "DEMOCRAT",
                 ifelse(str_detect(party, "DEMOCRATIC-FARM"), "DEMOCRAT",
                        "OTHER")))) %>%
  group_by(year, state_po, district, party) %>%
  summarize(votes = sum(candidatevotes), totalvotes = mean(totalvotes)) %>%
  pivot_wider(names_from = party, values_from = votes, values_fill = 0) %>%
  left_join(redist2, by = c("year"="Year", "state_po"="State"))

votes

The visualization filters the results to elections in states with more than one district, and also excludes races where a Democrat or Republican received zero votes (i.e. they were not on the ballot).

votes %>%
  filter(DEMOCRAT > 0, REPUBLICAN > 0, Seats > 1) %>%
  mutate(winner = ifelse(REPUBLICAN > DEMOCRAT, "Republican", "Democrat")) %>%
  ggplot(aes(x = DEMOCRAT, y = REPUBLICAN)) +
  geom_point(alpha = 0.1) +
  scale_y_continuous(labels = label_comma(suffix = "k", scale = 0.001), limits = c(0, 300000)) +
  scale_x_continuous(labels = label_comma(suffix = "k", scale = 0.001), limits = c(0, 300000)) +
  facet_wrap(~Drawn.by)

Proportion of votes and margins

Margins in states

# Dem share

votes <- votes %>% 
  mutate(dem_margin = 100*(DEMOCRAT-REPUBLICAN)/totalvotes)


# Create a new data frame with the avg_margin values grouped by year and Drawn.by

inst_lineplt <- votes %>% group_by(Drawn.by, year) %>% summarise(avg_margin = mean(dem_margin))


inst_lineplt %>% 
   ggplot(aes(x=year, y = avg_margin, colour = Drawn.by)) + 
  geom_line(size = .7) + 
  labs(title = "Vote share margins, by redistricting ", x = "year", y = "Dem Margin in (%)") +
    
  theme(plot.title = element_text(hjust = 0.5))


votes %>% filter(Drawn.by=="Legislature") %>%
  filter(DEMOCRAT > 0, REPUBLICAN > 0, Seats > 1) %>%
  ggplot(aes(x = dem_margin)) +
    geom_histogram(binwidth = 3) +
    geom_vline(xintercept = 0, color = "black") +
    labs(title = "Democratic vote margin (Legislature)",
         x = "Margin (percent)",
         y = "Frequency")